library(scales) # v 1.0.0
# counts_RNA <- read.table('../Mapping/output/counts_RNA_untreated_uaRNAscreens.txt',stringsAsFactors=F)
# counts_RNA_2and3 <- read.table('../Mapping/output/counts_RNA_untreated_uaRNAscreens_2and3.txt',stringsAsFactors=F)
# counts_RNA <- cbind('totalRNA_1'=counts_RNA[,1],counts_RNA_2and3,counts_RNA[,4:6])
# write.table(counts_RNA,file="../Mapping/output/counts_RNA_untreated_uaRNAscreens_all.txt",sep="\t")
counts_RNA <- read.table('../Mapping/output/counts_RNA_untreated_uaRNAscreens_all.txt',stringsAsFactors=F)

counts_RNA_scaled <- as.data.frame(scale(counts_RNA[-nrow(counts_RNA),],
                                         colSums(counts_RNA), center=F))

counts_gDNA <- read.table('../Mapping/output/counts_gDNA_forRNA_untreated_uaRNAscreens.txt',stringsAsFactors=F)
counts_gDNA_scaled <- as.data.frame(scale(counts_gDNA[-nrow(counts_gDNA),],
                                          colSums(counts_gDNA), center=F))

Put unspliced and spliced versions in side-by-side columns, calculate the totals

RNA_perinsert <- as.data.frame(matrix(0,nrow=nrow(counts_gDNA_scaled),ncol=ncol(counts_RNA_scaled)*6))
colnames(RNA_perinsert) <-
          paste(rep(colnames(counts_RNA_scaled),each=6),
                c('unspliced',paste0('spliced',1:3),'splicedtotal','total'),sep="_")
rownames(RNA_perinsert) <- rownames(counts_gDNA_scaled)
RNA_perinsert[,paste(colnames(counts_RNA_scaled),'unspliced',sep="_")] <-
  counts_RNA_scaled[1:nrow(counts_gDNA_scaled),]

for(sample in colnames(counts_RNA_scaled)) {
    for(i in 1:3){
        RNA_perinsert[,paste0(sample,'_spliced',i)] <-
            sapply(rownames(RNA_perinsert),function(x){
                    counts_RNA_scaled[paste0(x,'_spliced',i),sample]
        })
    }
  RNA_perinsert[,paste0(sample,'_splicedtotal')] <- 
    rowSums(RNA_perinsert[,grep(paste0(sample,'_spliced.$'),colnames(RNA_perinsert))],na.rm=T)
  RNA_perinsert[,paste0(sample,'_total')] <-
    rowSums(RNA_perinsert[,c(paste0(sample,'_unspliced'),paste0(sample,'_splicedtotal'))],na.rm=T)
}

Normalize to gDNA

# These are thresholds I settled on to balance improving correlation between experiments and not losing too much data.
thresholds <- c("gDNA_1"=1e-5,"gDNA_2"=2e-6,"gDNA_3"=2e-6,"gDNA_4"=2e-6,"gDNA_5"=2e-6) 

RNA_over_gDNA <- RNA_perinsert
for(experiment in c('_1','_2','_3','_4','_5')) {
    RNA_over_gDNA[,grep(paste0(experiment,"_"),colnames(RNA_over_gDNA))] <-
      RNA_over_gDNA[,grep(paste0(experiment,"_"),colnames(RNA_over_gDNA))] /
                               counts_gDNA_scaled[,grep(paste0(experiment,"$"),colnames(counts_gDNA_scaled))]
    RNA_over_gDNA[counts_gDNA_scaled[,grep(paste0(experiment,"$"),colnames(counts_gDNA_scaled))] <
                    thresholds[grep(paste0(experiment,"$"),names(thresholds))],
                  grep(paste0(experiment,"_"),colnames(RNA_over_gDNA))] <- NA
}

plot(log(RNA_over_gDNA$totalRNA_2_total,10),log(RNA_over_gDNA$totalRNA_1_total,10),pch=16,col=alpha(1,.2),cex=.8,
     xlab="total-RNA exp2 (log10)",ylab="total-RNA exp1 (log10)")

plot(log(RNA_over_gDNA$totalRNA_2_total,10),log(RNA_over_gDNA$totalRNA_3_total,10),pch=16,col=alpha(1,.2),cex=.8,
     xlab="total-RNA exp2 (log10)",ylab="total-RNA exp3 (log10)")

plot(log(RNA_over_gDNA$totalRNA_2_total,10),log(RNA_over_gDNA$totalRNA_4_total,10),pch=16,col=alpha(1,.2),cex=.8,
     xlab="total-RNA exp2 (log10)",ylab="total-RNA exp4 (log10)")

plot(log(RNA_over_gDNA$chrRNA_2_total,10),log(RNA_over_gDNA$chrRNA_3_total,10),pch=16,col=alpha(1,.2),cex=.8,
     xlab="chr-RNA exp2 (log10)",ylab="chr-RNA exp3 (log10)")

plot(log(RNA_over_gDNA$runon_4_total,10),log(RNA_over_gDNA$runon_5_total,10),pch=16,col=alpha(1,.2),cex=.8,
     xlab="runon-RNA exp4 (log10)",ylab="runon-RNA exp5 (log10)")

Set median of randomized controls to 1

load('input/HV20200429_full_library.RData')
controls <- full_library$unique_id[full_library$category=="randomized_control"]

RNA_over_ctrl <- RNA_over_gDNA
for(sample in colnames(counts_RNA_scaled)){
  RNA_over_ctrl[,grep(sample,colnames(RNA_over_ctrl))] <- 
    RNA_over_gDNA[,grep(sample,colnames(RNA_over_gDNA))] / 
      median(RNA_over_gDNA[controls, paste(sample,'total',sep="_")],na.rm=T)
}

Calculate splicing efficiencies

splicingefficiencies <- data.frame("totalRNA_1_splicingefficiency"=
                                     RNA_over_ctrl$totalRNA_1_splicedtotal / RNA_over_ctrl$totalRNA_1_total,
                                   "totalRNA_2_splicingefficiency"=
                                     RNA_over_ctrl$totalRNA_2_splicedtotal / RNA_over_ctrl$totalRNA_2_total,
                                   "totalRNA_3_splicingefficiency"=
                                     RNA_over_ctrl$totalRNA_3_splicedtotal / RNA_over_ctrl$totalRNA_3_total,
                                   "totalRNA_4_splicingefficiency"=
                                     RNA_over_ctrl$totalRNA_4_splicedtotal / RNA_over_ctrl$totalRNA_4_total,
                                   "chrRNA_2_splicingefficiency"=
                                     RNA_over_ctrl$chrRNA_2_splicedtotal / RNA_over_ctrl$chrRNA_2_total,
                                   "chrRNA_3_splicingefficiency"=
                                     RNA_over_ctrl$chrRNA_3_splicedtotal / RNA_over_ctrl$chrRNA_3_total,
                                   "runonRNA_4_splicingefficiency"=
                                     RNA_over_ctrl$runon_4_splicedtotal / RNA_over_ctrl$runon_4_total,
                                   "runonRNA_5_splicingefficiency"=
                                     RNA_over_ctrl$runon_5_splicedtotal / RNA_over_ctrl$runon_5_total)
RNA_over_ctrl <- cbind(RNA_over_ctrl,splicingefficiencies)

Correlations between replicates after all normalizations

cor(RNA_over_ctrl[grep('_._total',colnames(RNA_over_ctrl))],use="complete.obs",method="spearman")
##                  totalRNA_1_total chrRNA_2_total totalRNA_2_total
## totalRNA_1_total        1.0000000      0.4451689        0.6808857
## chrRNA_2_total          0.4451689      1.0000000        0.6478330
## totalRNA_2_total        0.6808857      0.6478330        1.0000000
## chrRNA_3_total          0.4750332      0.4647187        0.5417640
## totalRNA_3_total        0.6532091      0.4478035        0.7050558
## runon_4_total           0.4728122      0.3727739        0.4729614
## runon_5_total           0.3061248      0.3248173        0.3697498
## totalRNA_4_total        0.7355930      0.4776267        0.7327673
##                  chrRNA_3_total totalRNA_3_total runon_4_total runon_5_total
## totalRNA_1_total      0.4750332        0.6532091     0.4728122     0.3061248
## chrRNA_2_total        0.4647187        0.4478035     0.3727739     0.3248173
## totalRNA_2_total      0.5417640        0.7050558     0.4729614     0.3697498
## chrRNA_3_total        1.0000000        0.6560775     0.3776363     0.3043534
## totalRNA_3_total      0.6560775        1.0000000     0.4238523     0.3181772
## runon_4_total         0.3776363        0.4238523     1.0000000     0.3789335
## runon_5_total         0.3043534        0.3181772     0.3789335     1.0000000
## totalRNA_4_total      0.4988937        0.6992744     0.6235121     0.3692230
##                  totalRNA_4_total
## totalRNA_1_total        0.7355930
## chrRNA_2_total          0.4776267
## totalRNA_2_total        0.7327673
## chrRNA_3_total          0.4988937
## totalRNA_3_total        0.6992744
## runon_4_total           0.6235121
## runon_5_total           0.3692230
## totalRNA_4_total        1.0000000

Add pseudocounts to everything with < 5e-3 normalized counts

RNA_over_ctrl_pseudo <- RNA_over_ctrl
RNA_over_ctrl_pseudo[,grep('_total',colnames(RNA_over_ctrl_pseudo))][
  RNA_over_ctrl_pseudo[,grep('_total',colnames(RNA_over_ctrl_pseudo))]< 5e-3] <- 5e-3

Plots after all normalizations and adding pseudocounts

plot(log(RNA_over_ctrl_pseudo$totalRNA_1_total,2),
     log(RNA_over_ctrl_pseudo$totalRNA_2_total,2),
     pch=16,cex=0.6,col=alpha(1,.3),xlim=c(-6.5,6),ylim=c(-6.5,6),
     main="Total-RNA: 1 vs 2",xlab="Total-RNA repl 1 (log2)",ylab="Total-RNA repl 2 (log2)")

plot(log(RNA_over_ctrl_pseudo$totalRNA_2_total,2),
     log(RNA_over_ctrl_pseudo$totalRNA_3_total,2),
     pch=16,cex=0.6,col=alpha(1,.3),xlim=c(-6.5,6),ylim=c(-6.5,6),
     main="Total-RNA: 2 vs 3",xlab="Total-RNA repl 2 (log2)",ylab="Total-RNA repl 3 (log2)")

plot(log(RNA_over_ctrl_pseudo$totalRNA_2_total,2),
     log(RNA_over_ctrl_pseudo$totalRNA_4_total,2),
     pch=16,cex=0.6,col=alpha(1,.3),xlim=c(-6.5,6),ylim=c(-6.5,6.5),
     main="Total-RNA: 2 vs 4",xlab="Total-RNA repl 2 (log2)",ylab="Total-RNA repl 4 (log2)")

plot(log(RNA_over_ctrl_pseudo$chrRNA_2_total,2),
     log(RNA_over_ctrl_pseudo$chrRNA_3_total,2),
     pch=16,cex=0.6,col=alpha(1,.3),
     main="chr-RNA: 2 vs 3",xlab="Chr-RNA replicate 1 (exp2) (log2)",ylab="Chr-RNA replicate 2 (exp3) (log2)")

plot(log(RNA_over_ctrl_pseudo$runon_4_total,2),
     log(RNA_over_ctrl_pseudo$runon_5_total,2),
     pch=16,cex=0.6,col=alpha(1,.3),
     main="Two run-on replicates",xlab="Run-on 4 (log2)",ylab="Run-on 5 (log2)")

Average replicates. Divide by median of controls again after averaging, so that the controls are really set to 1. Calculate correlations. Then add pseudocounts and make plots

RNA_over_ctrl_totals <- RNA_over_ctrl[,grep('_total|splicing',colnames(RNA_over_ctrl))]
RNA_over_ctrl_averages <- RNA_over_ctrl_totals
RNA_over_ctrl_averages$totalRNA_average <-
    rowMeans(RNA_over_ctrl_averages[,grep('totalRNA_._total',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages$totalRNA_average <- RNA_over_ctrl_averages$totalRNA_average /
                                      median(RNA_over_ctrl_averages[controls,"totalRNA_average"],na.rm=T)
RNA_over_ctrl_averages$chrRNA_average <-
    rowMeans(RNA_over_ctrl_averages[,grep('chrRNA_._total',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages$chrRNA_average <- RNA_over_ctrl_averages$chrRNA_average /
                                      median(RNA_over_ctrl_averages[controls,"chrRNA_average"],na.rm=T)
RNA_over_ctrl_averages$runonRNA_average <-
          rowMeans(RNA_over_ctrl_averages[,grep('runon_._total',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages$runonRNA_average <- RNA_over_ctrl_averages$runonRNA_average /
                                     median(RNA_over_ctrl_averages[controls,"runonRNA_average"],na.rm=T)
RNA_over_ctrl_averages$splicingeff_average <-
  rowMeans(RNA_over_ctrl_averages[,grep('totalRNA_._splicingefficiency',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages$splicingeff_chr_average <-
  rowMeans(RNA_over_ctrl_averages[,grep('chrRNA_._splicingefficiency',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages$splicingeff_runon_average <-
  rowMeans(RNA_over_ctrl_averages[,grep('runonRNA_._splicingefficiency',colnames(RNA_over_ctrl_averages))],na.rm=T)
RNA_over_ctrl_averages <- RNA_over_ctrl_averages[,grep('average',colnames(RNA_over_ctrl_averages))]

# Correlations
cor(RNA_over_ctrl_averages$totalRNA_average,RNA_over_ctrl_averages$chrRNA_average,use="complete.obs",method="spearman")
## [1] 0.6455142
cor(RNA_over_ctrl_averages$totalRNA_average,RNA_over_ctrl_averages$runonRNA_average,use="complete.obs",method="spearman")
## [1] 0.5508029
cor(RNA_over_ctrl_averages$chrRNA_average,RNA_over_ctrl_averages$runonRNA_average,use="complete.obs",method="spearman")
## [1] 0.4433317
RNA_over_ctrl_averages_pseudo <- RNA_over_ctrl_averages
RNA_over_ctrl_averages_pseudo[,grep('RNA',colnames(RNA_over_ctrl_averages))][
                    RNA_over_ctrl_averages[,grep('RNA',colnames(RNA_over_ctrl_averages))]< 5e-3] <- 5e-3

plot(log(RNA_over_ctrl_averages_pseudo$chrRNA_average,10),
     log(RNA_over_ctrl_averages_pseudo$totalRNA_average,10),
     pch=16,cex=0.6,col=alpha(1,.15),xlim=c(-2.5,2),ylim=c(-2.5,2),
     main="Chr-RNA vs Total-RNA",xlab="Chr-RNA average (log10)",ylab="Total-RNA average (log10)")

plot(log(RNA_over_ctrl_averages_pseudo$runonRNA_average,10),
     log(RNA_over_ctrl_averages_pseudo$totalRNA_average,10),
     pch=16,cex=0.6,col=alpha(1,.15),xlim=c(-2.5,2),ylim=c(-2.5,2),
     main="Run-on-RNA vs Total-RNA",xlab="Run-on-RNA average (log10)",ylab="Total-RNA average (log10)")

plot(log(RNA_over_ctrl_averages_pseudo$runonRNA_average,10),
     log(RNA_over_ctrl_averages_pseudo$chrRNA_average,10),
     pch=16,cex=0.6,col=alpha(1,.15),xlim=c(-2.5,2),ylim=c(-2.5,2),
     main="Run-on-RNA vs Chr-RNA",xlab="Run-on-RNA average (log10)",ylab="Chr-RNA average (log10)")

Save

save(RNA_over_ctrl,RNA_over_ctrl_pseudo,file="output/normalized_RNA_untreated_uaRNAscreens.RData")
save(RNA_over_ctrl_averages,file="output/normalized_RNA_averagedtotals_untreated_uaRNAscreens.RData")
save(RNA_over_ctrl_totals,file="output/normalized_RNA_totals_untreated_uaRNAscreens.RData")